library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4 ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
here() starts at C:/Users/Lenovo/Documents/RStudioProjects/UFO-Sightings---R-Project
library(withr)
# Download the raw UFO sightings table from the TidyTuesday 2023-06-20 release.
ufo_sightings <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/ufo_sightings.csv"
)
Rows: 96429 Columns: 12── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): city, state, country_code, shape, reported_duration, summary, day_part
dbl (1): duration_seconds
lgl (1): has_images
dttm (2): reported_date_time, reported_date_time_utc
date (1): posted_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Download the places lookup table from the TidyTuesday 2023-06-20 release.
places <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/places.csv"
)
Rows: 14417 Columns: 10── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): city, alternate_city_names, state, country, country_code, timezone
dbl (4): latitude, longitude, population, elevation_m
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Download the day-parts table from the TidyTuesday 2023-06-20 release.
day_parts_map <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/day_parts_map.csv"
)
Rows: 26409 Columns: 12── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (2): rounded_lat, rounded_long
date (1): rounded_date
time (9): astronomical_twilight_begin, nautical_twilight_begin, civil_twilight_begin, sunrise, solar_noon, sunset, civil_twilight_end, nauti...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Checking if files have been loaded correctly
# Print the first rows of each downloaded table as a quick load check.
ufo_sightings %>% head()
places %>% head()
day_parts_map %>% head()
Saving data in respective files
# Make sure the target folder exists, then write each raw table to
# "<name>.csv" inside it.
dir.create(
  here("data", "2023", "2023-06-20"),
  recursive = TRUE,
  showWarnings = FALSE
)
list(
  ufo_sightings = ufo_sightings,
  places = places,
  day_parts_map = day_parts_map
) %>%
  iwalk(function(tbl, stem) {
    write_csv(tbl, here("data", "2023", "2023-06-20", paste0(stem, ".csv")))
  })
Checking missing data
# Column-wise overview (types and first values) of the sightings table.
ufo_sightings %>% glimpse()
Rows: 96,429
Columns: 12
$ reported_date_time <dttm> 2022-08-29 06:03:00, 2022-08-20 01:51:00, 2022-08-13 05:30:00, 2022-08-06 21:00:00, 2022-08-04 07:40:00, 2022-…
$ reported_date_time_utc <dttm> 2022-08-29 06:03:00, 2022-08-20 01:51:00, 2022-08-13 05:30:00, 2022-08-06 21:00:00, 2022-08-04 07:40:00, 2022-…
$ posted_date <date> 2022-09-09, 2022-10-08, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 20…
$ city <chr> "Pinehurst", "Rapid City", "Cleveland", "Bloomington", "Irvine", "Moore", "Short Pump", "Norwalk", "Blayney", "…
$ state <chr> "NC", "MI", "OH", "IN", "CA", "OK", "VA", "CT", "New South Wales", "WY", "NH", "AZ", "FL", "OR", "Haryana", "NM…
$ country_code <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US…
$ shape <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ reported_duration <chr> "15 mins\u0085", "1 minute", "2 hours", "30 seconds", "3 minutes", "10 minutes", "20 seconds", "5 minutes", "90…
$ duration_seconds <dbl> 900, 60, 172800, 30, 180, 600, 20, 300, 120, 1800, 10, 3, 45, 60, 240, 32, 300, 600, 180, 1200, 45, 300, 180, 1…
$ summary <chr> "Saw multi color object above horizon.", "An object in the shape of a straight line about an inch from our view…
$ has_images <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
$ day_part <chr> "night", "nautical dusk", "night", "afternoon", "night", "morning", "morning", "afternoon", NA, "astronomical d…
# Column-wise overview (types and first values) of the places table.
places %>% glimpse()
Rows: 14,417
Columns: 10
$ city <chr> "Pinehurst", "Rapid City", "Cleveland", "Bloomington", "Irvine", "Moore", "Short Pump", "Norwalk", "Blayney", "Gr…
$ alternate_city_names <chr> "Pajnkherst,bynhwrst,pynhwrst karwlynay shmaly,Пајнхерст,بينهورست,پینهورست، کارولینای شمالی", NA, "CLE,Cleavelan…
$ state <chr> "NC", "MI", "OH", "IN", "CA", "OK", "VA", "CT", "New South Wales", "WY", "NH", "AZ", "FL", "OR", "Haryana", "NM",…
$ country <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "Australia", "USA", "USA", "USA", "USA", "USA", "India", …
$ country_code <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US",…
$ latitude <dbl> 35.19543, 44.83445, 41.49950, 39.16533, 33.66946, 35.33951, 37.65042, 41.11760, -33.53233, 44.48912, 44.05368, 33…
$ longitude <dbl> -79.46948, -85.28256, -81.69541, -86.52639, -117.82311, -97.48670, -77.61249, -73.40790, 149.25367, -108.05621, -…
$ timezone <chr> "America/New_York", "America/Detroit", "America/New_York", "America/Indiana/Indianapolis", "America/Los_Angeles",…
$ population <dbl> 15752, 1352, 388072, 84067, 256927, 60451, 24729, 88485, 3355, 1879, 2349, 1608139, 71579, 7912, 197340, 559121, …
$ elevation_m <dbl> 160, 192, 199, 235, 17, 382, 89, 11, NA, 1156, 160, 331, 1, 157, NA, 1511, 414, 2, 393, 89, 1373, 264, 1, 89, 220…
# Column-wise overview (types and first values) of the day-parts table.
day_parts_map %>% glimpse()
Rows: 26,409
Columns: 12
$ rounded_lat <dbl> 40, 40, 40, 40, 30, 40, 40, 40, -30, 40, 40, 30, 30, 40, 30, 40, 30, 30, 40, 30, 40, 30, 40, 30, 50, 40, 2…
$ rounded_long <dbl> -80, -90, -80, -90, -120, -100, -80, -70, 150, -110, -70, -110, -80, -120, 80, -110, -120, -80, -100, -120…
$ rounded_date <date> 2022-08-28, 2022-08-21, 2022-08-14, 2022-08-07, 2022-08-07, 2022-07-24, 2022-07-17, 2022-07-17, 2022-07-1…
$ astronomical_twilight_begin <time> 09:07:43, 09:38:33, 08:48:54, 09:19:00, 11:55:02, 09:39:05, 08:09:39, 07:29:37, 19:30:59, 10:01:25, 07:21…
$ nautical_twilight_begin <time> 09:42:49, 10:14:53, 09:26:40, 09:58:23, 12:26:50, 10:22:03, 08:54:27, 08:14:25, 20:00:01, 10:47:51, 08:07…
$ civil_twilight_begin <time> 10:16:18, 10:49:12, 10:01:56, 10:34:41, 12:57:22, 11:00:33, 09:34:01, 08:54:00, 20:29:37, 11:28:21, 08:48…
$ sunrise <time> 10:42:53, 11:16:15, 10:29:32, 11:02:53, 13:21:39, 11:30:01, 10:04:06, 09:24:04, 20:54:22, 11:58:55, 09:18…
$ solar_noon <time> 17:21:10, 18:03:05, 17:24:39, 18:05:45, 20:05:45, 18:46:32, 17:26:12, 16:46:12, 02:05:20, 19:25:26, 16:45…
$ sunset <time> 23:59:27, 00:49:55, 00:19:45, 01:08:37, 02:49:51, 02:03:03, 00:48:18, 00:08:20, 07:16:18, 02:51:58, 00:12…
$ civil_twilight_end <time> 00:26:02, 01:16:58, 00:47:21, 01:36:50, 03:14:07, 02:32:31, 01:18:23, 00:38:24, 07:41:03, 03:22:32, 00:42…
$ nautical_twilight_end <time> 00:59:31, 01:51:17, 01:22:37, 02:13:08, 03:44:39, 03:11:02, 01:57:57, 01:17:59, 08:10:40, 04:03:02, 01:23…
$ astronomical_twilight_end <time> 01:34:37, 02:27:37, 02:00:24, 02:52:31, 04:16:27, 03:54:00, 02:42:45, 02:02:47, 08:39:42, 04:49:28, 02:09…
# Number of missing values in each column of the sightings table.
ufo_sightings %>%
  is.na() %>%
  colSums()
reported_date_time reported_date_time_utc posted_date city state country_code
0 0 0 0 85 0
shape reported_duration duration_seconds summary has_images day_part
2039 0 0 31 0 2563
# Number of missing values in each column of the places table.
places %>%
  is.na() %>%
  colSums()
city alternate_city_names state country country_code latitude
0 2953 32 0 0 0
longitude timezone population elevation_m
0 0 0 2285
# Number of missing values in each column of the day-parts table.
day_parts_map %>%
  is.na() %>%
  colSums()
rounded_lat rounded_long rounded_date astronomical_twilight_begin nautical_twilight_begin
0 0 0 951 122
civil_twilight_begin sunrise solar_noon sunset civil_twilight_end
2 2 0 2 2
nautical_twilight_end astronomical_twilight_end
122 951
Summary of the data
# Row and column count of the sightings table.
ufo_sightings %>% dim()
[1] 96429 12
# Per-column summary statistics of the sightings table.
ufo_sightings %>% summary()
reported_date_time reported_date_time_utc posted_date city state
Min. :1925-12-29 00:00:00.00 Min. :1925-12-29 00:00:00.00 Min. :1998-03-07 Length:96429 Length:96429
1st Qu.:2004-10-01 05:10:00.00 1st Qu.:2004-10-01 05:10:00.00 1st Qu.:2006-10-30 Class :character Class :character
Median :2012-02-05 03:00:00.00 Median :2012-02-05 03:00:00.00 Median :2012-08-19 Mode :character Mode :character
Mean :2009-04-30 02:41:30.98 Mean :2009-04-30 02:41:30.98 Mean :2011-09-26
3rd Qu.:2016-01-25 03:30:00.00 3rd Qu.:2016-01-25 03:30:00.00 3rd Qu.:2016-07-15
Max. :2023-05-18 19:27:00.00 Max. :2023-05-18 19:27:00.00 Max. :2023-05-19
country_code shape reported_duration duration_seconds summary has_images day_part
Length:96429 Length:96429 Length:96429 Min. :0.000e+00 Length:96429 Mode :logical Length:96429
Class :character Class :character Class :character 1st Qu.:3.000e+01 Class :character FALSE:96429 Class :character
Mode :character Mode :character Mode :character Median :1.800e+02 Mode :character Mode :character
Mean :3.161e+04
3rd Qu.:6.000e+02
Max. :1.987e+09
# Row and column count of the places table.
places %>% dim()
[1] 14417 10
# Per-column summary statistics of the places table.
places %>% summary()
city alternate_city_names state country country_code latitude longitude
Length:14417 Length:14417 Length:14417 Length:14417 Length:14417 Min. :-53.15 Min. :-170.48
Class :character Class :character Class :character Class :character Class :character 1st Qu.: 34.99 1st Qu.: -95.46
Mode :character Mode :character Mode :character Mode :character Mode :character Median : 40.09 Median : -84.21
Mean : 37.76 Mean : -75.36
3rd Qu.: 42.96 3rd Qu.: -74.82
Max. : 70.64 Max. : 179.19
timezone population elevation_m
Length:14417 Min. : 0 Min. : -57.0
Class :character 1st Qu.: 1926 1st Qu.: 65.0
Mode :character Median : 6085 Median : 194.0
Mean : 86375 Mean : 288.2
3rd Qu.: 21993 3rd Qu.: 304.0
Max. :22315474 Max. :3097.0
NA's :2285
# Row and column count of the day-parts table.
day_parts_map %>% dim()
[1] 26409 12
# Per-column summary statistics of the day-parts table.
day_parts_map %>% summary()
rounded_lat rounded_long rounded_date astronomical_twilight_begin nautical_twilight_begin civil_twilight_begin
Min. :-50.00 Min. :-170.00 Min. :1925-12-27 Length:26409 Length:26409 Length:26409
1st Qu.: 30.00 1st Qu.:-110.00 1st Qu.:1999-01-17 Class1:hms Class1:hms Class1:hms
Median : 40.00 Median : -90.00 Median :2007-03-25 Class2:difftime Class2:difftime Class2:difftime
Mean : 36.23 Mean : -80.01 Mean :2004-03-18 Mode :numeric Mode :numeric Mode :numeric
3rd Qu.: 40.00 3rd Qu.: -80.00 3rd Qu.:2014-10-12
Max. : 70.00 Max. : 180.00 Max. :2023-05-21
sunrise solar_noon sunset civil_twilight_end nautical_twilight_end astronomical_twilight_end
Length:26409 Length:26409 Length:26409 Length:26409 Length:26409 Length:26409
Class1:hms Class1:hms Class1:hms Class1:hms Class1:hms Class1:hms
Class2:difftime Class2:difftime Class2:difftime Class2:difftime Class2:difftime Class2:difftime
Mode :numeric Mode :numeric Mode :numeric Mode :numeric Mode :numeric Mode :numeric
# Count rows that are exact repeats of an earlier row.
ufo_sightings %>%
  duplicated() %>%
  sum()
[1] 3
Due to the large size of the file on which the analysis is performed, only some of the data was used for visual representation.
library(tidyverse)
library(here)
library(withr)
library(naniar)
# Plot the missingness pattern on a random sample of 1000 rows (the full
# table is too large to draw). The seed is fixed so the sample, and
# therefore the figure, is the same on every run.
withr::with_seed(
  2023,
  ufo_sightings %>%
    dplyr::slice_sample(n = 1000) %>%
    vis_miss(cluster = TRUE, sort_miss = TRUE)
)
Replacing missing data with most frequent entry and saving cleaned data
# Most frequent non-missing day part. NA is excluded before counting so the
# fill value can never itself be NA.
most_common_day_part <- ufo_sightings %>%
  filter(!is.na(day_part)) %>%
  count(day_part, sort = TRUE) %>%
  slice(1) %>%
  pull(day_part)

# Most frequent non-missing shape, computed the same way.
most_common_shape <- ufo_sightings %>%
  filter(!is.na(shape)) %>%
  count(shape, sort = TRUE) %>%
  slice(1) %>%
  pull(shape)

# Fill missing entries with the most frequent value of the column.
ufo_clean <- ufo_sightings %>%
  mutate(
    day_part = coalesce(day_part, most_common_day_part),
    shape = coalesce(shape, most_common_shape)
  )

write_csv(ufo_clean, here("data", "2023", "2023-06-20", "ufo_clean.csv"))
# Missingness pattern of the cleaned sightings on a random 1000-row sample.
# The seed is fixed so the figure is the same on every run.
withr::with_seed(
  2023,
  ufo_clean %>%
    dplyr::slice_sample(n = 1000) %>%
    vis_miss(cluster = TRUE, sort_miss = TRUE)
)

# Missingness pattern of the raw places table, sampled the same way.
withr::with_seed(
  2023,
  places %>%
    dplyr::slice_sample(n = 1000) %>%
    vis_miss(cluster = TRUE, sort_miss = TRUE)
)
Replacing missing alternate city names with a single-space placeholder, missing elevation with the median value, and saving cleaned data
# No better filler for a missing alternate name is known than " ".
# Missing elevations take the median of the observed elevations.
median_elevation <- median(places$elevation_m, na.rm = TRUE)

places_clean <- places %>%
  mutate(
    alternate_city_names = replace_na(alternate_city_names, " "),
    elevation_m = coalesce(elevation_m, median_elevation)
  )

write_csv(places_clean, here("data", "2023", "2023-06-20", "places_clean.csv"))
# Missingness pattern of the cleaned places on a random 1000-row sample.
# The seed is fixed so the figure is the same on every run.
withr::with_seed(
  2023,
  places_clean %>%
    dplyr::slice_sample(n = 1000) %>%
    vis_miss(cluster = TRUE, sort_miss = TRUE)
)

# Missingness pattern of the raw day-parts table, sampled the same way.
withr::with_seed(
  2023,
  day_parts_map %>%
    dplyr::slice_sample(n = 1000) %>%
    vis_miss(cluster = TRUE, sort_miss = TRUE)
)
Replacing missing astronomical twilight times with the median value and saving cleaned data
# Fill missing astronomical twilight times with the column median.
# base::ifelse() drops the hms class and would leave bare numeric seconds in
# the table and in the CSV, so the median is computed on the underlying
# seconds, rounded to a whole second, converted back to hms and merged with
# coalesce(), which preserves the column type.
day_parts_clean <- day_parts_map %>%
  mutate(
    across(
      c(astronomical_twilight_begin, astronomical_twilight_end),
      function(x) {
        fill <- hms::as_hms(round(median(as.numeric(x), na.rm = TRUE)))
        coalesce(x, fill)
      }
    )
  )

write_csv(day_parts_clean, here("data", "2023", "2023-06-20", "day_parts_clean.csv"))
# Missingness pattern of the cleaned day-parts table on a random 1000-row
# sample. The seed is fixed so the figure is the same on every run.
withr::with_seed(
  2023,
  day_parts_clean %>%
    dplyr::slice_sample(n = 1000) %>%
    vis_miss(cluster = TRUE, sort_miss = TRUE)
)
# Keep only rows that are complete in the columns used for modelling.
ufo_model_data <- drop_na(ufo_clean, shape, reported_duration, summary)
places_model_data <- drop_na(places_clean, alternate_city_names)

# Inspect the three modelling tables.
ufo_model_data %>% glimpse()
places_model_data %>% glimpse()
day_parts_clean %>% glimpse()

# Persist the filtered tables next to the raw data.
write_csv(
  ufo_model_data,
  here("data", "2023", "2023-06-20", "ufo_model_data.csv")
)
write_csv(
  places_model_data,
  here("data", "2023", "2023-06-20", "places_model_data.csv")
)
library(lubridate)
library(dplyr)
library(hms)
Dołączanie pakietu: ‘hms’
Następujący obiekt został zakryty z ‘package:lubridate’:
hms
Adding new columns to “sightings” dataframe
# Derive calendar, location and reporting-delay features for each sighting.
ufo_model_data_mutated <- ufo_model_data %>%
  mutate(
    year = year(reported_date_time),
    month = month(reported_date_time),
    # Full English weekday names ("Monday", ...), independent of the session
    # locale.
    weekday = wday(reported_date_time, label = TRUE, abbr = FALSE, locale = "C"),
    # `weekday` holds full names, so matching against "Sat"/"Sun" was always
    # FALSE. Use the numeric weekday with Monday = 1 instead:
    # 6 = Saturday, 7 = Sunday.
    is_weekend = wday(reported_date_time, week_start = 1) >= 6,
    country_upper = toupper(country_code),
    report_hour = hour(reported_date_time),
    city_state = paste(city, state, sep = ", "),
    # Whole days between the reported date and the date the report was posted.
    report_delay_days = as.numeric(
      difftime(posted_date, as.Date(reported_date_time), units = "days")
    )
  )
ufo_model_data_mutated
- year: Extracts the year from reported_date_time.
- month: Extracts the month (1–12) from the report timestamp.
- weekday: Returns the full weekday name from the date.
- is_weekend: Logical column: TRUE if the day is Saturday or Sunday, FALSE otherwise.
- country_upper: Converts the country_code to uppercase.
- report_hour: Extracts the hour (0–23) from the report timestamp.
- city_state: Concatenates city and state into a single string.
- report_delay_days: Calculates the delay in days between the reported sighting date and the date the report was posted.

Adding new columns to “places” dataframe
# Derive location, population and elevation features for each place.
places_model_data_mutated <- places_model_data %>%
  mutate(
    city_state = paste(city, state, sep = ", "),
    is_us = country_code == "US",
    # log(1 + population), so a population of 0 maps to 0.
    population_log = log1p(population),
    hemisphere = if_else(latitude >= 0, "Northern", "Southern"),
    # Rough proxy only: |longitude| outside [80, 120].
    is_coastal = abs(longitude) < 80 | abs(longitude) > 120,
    pop_category = case_when(
      population < 10000 ~ "small",
      population < 100000 ~ "medium",
      TRUE ~ "large"
    ),
    elevation_category = case_when(
      is.na(elevation_m) ~ "unknown",
      elevation_m < 100 ~ "low",
      elevation_m < 500 ~ "medium",
      TRUE ~ "high"
    ),
    name_length = nchar(city),
    # Second "/"-separated component of the timezone, NA when there is none.
    # vapply() always returns a character vector, whereas sapply() changes
    # its return type on empty or ragged input.
    timezone_area = vapply(
      strsplit(timezone, "/"),
      function(parts) parts[2],
      character(1)
    )
  )
places_model_data_mutated
- city_state: Combines city and state into a single string.
- is_us: Logical value: TRUE if the location is in the United States, FALSE otherwise.
- population_log: Log-transformed population, log(1 + population).
- hemisphere: "Northern" if latitude is ≥ 0, "Southern" otherwise.
- is_coastal: Logical: TRUE if longitude is outside the range [80, 120] in absolute value — a rough coastal proxy.
- pop_category: Categorizes places based on population: "small", "medium", or "large".
- elevation_category: Classifies elevation: "low" (<100 m), "medium" (<500 m), "high" (≥500 m), or "unknown" if NA.
- name_length: The number of characters in the city name.
- timezone_area: Extracts the second part of the timezone string (the component after the first "/").

Adding new columns to “day parts” dataframe
# Derive daylight and twilight features from the clock-time columns.
# The glimpse output shows these are clock times that wrap at midnight
# (e.g. sunrise 11:16:15, sunset 00:49:55), so a plain `end - start` is
# negative whenever the end falls on the next day. Differences are therefore
# taken modulo 24 h (86400 s).
day_parts_model_mutated <- day_parts_clean %>%
  mutate(
    daylight_duration = (as.numeric(sunset) - as.numeric(sunrise)) %% 86400,
    is_northern_hemisphere = rounded_lat >= 0,
    sunrise_hour = hour(sunrise),
    sunset_hour = hour(sunset),
    is_day_short = daylight_duration < 36000, # less than 10 h
    # Span from astronomical_twilight_begin to astronomical_twilight_end.
    twilight_duration = (
      as.numeric(astronomical_twilight_end) -
        as.numeric(astronomical_twilight_begin)
    ) %% 86400,
    # NOTE(review): this span covers the whole period between morning and
    # evening astronomical twilight, so the 1.5 h threshold is almost always
    # exceeded - confirm the intended definition.
    is_long_twilight = twilight_duration > 5400, # 1.5 h
    sunrise_minutes = hour(sunrise) * 60 + minute(sunrise),
    solar_noon_minutes = hour(solar_noon) * 60 + minute(solar_noon),
    sunset_minutes = hour(sunset) * 60 + minute(sunset)
  )
day_parts_model_mutated
- daylight_duration: The length of the day in seconds — difference between sunset and sunrise.
- is_northern_hemisphere: Logical: TRUE if the location is in the Northern Hemisphere.
- sunrise_hour: The hour (0–23) when the sun rises.
- sunset_hour: The hour (0–23) when the sun sets.
- is_day_short: Logical: TRUE if the day is shorter than 10 hours.
- twilight_duration: Duration of astronomical twilight in seconds — time between astronomical_twilight_begin and astronomical_twilight_end.
- is_long_twilight: Logical: TRUE if twilight duration is longer than 1.5 hours.
- sunrise_minutes: Sunrise time in total minutes from midnight.
- solar_noon_minutes: Solar noon time in minutes from midnight.
- sunset_minutes: Sunset time in minutes from midnight.

library(ggplot2)
library(dplyr)
library(sf)
Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(rnaturalearth)
library(rnaturalearthdata)
Dołączanie pakietu: ‘rnaturalearthdata’
Następujący obiekt został zakryty z ‘package:rnaturalearth’:
countries110
library(ggplot2)
library(dplyr)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
##Number of sightings per day
# Line chart of the number of sightings on each calendar date.
ggplot(
  data = count(ufo_model_data_mutated, date = as.Date(reported_date_time)),
  mapping = aes(x = date, y = n)
) +
  geom_line(color = "steelblue") +
  labs(
    x = "Date",
    y = "Number of sightings",
    title = "Number of sightings per day"
  )
Interpretation:
The chart shows daily UFO sightings over time. Sightings were rare before 1960, gradually increased through the 1990s, and peaked between 2000 and 2015. After 2015, the number of reports declined sharply. This suggests that UFO sightings may be influenced by media, public interest, or reporting practices.
# Yearly sighting counts with a loess smoother overlaid.
ufo_model_data_mutated %>%
  count(year = lubridate::year(reported_date_time)) %>%
  ggplot(mapping = aes(x = year, y = n)) +
  geom_line(color = "darkblue") +
  geom_smooth(method = "loess", se = FALSE, color = "red") +
  theme_minimal() +
  labs(
    x = "Year",
    y = "Number of Sightings",
    title = "Annual Trend of UFO Sightings"
  )
Interpretation:
The chart shows a clear rise in UFO sightings from the 1980s to around 2012, with a peak near 2014. After that, there’s a sharp decline in reports. The red loess curve highlights a long-term upward trend followed by a recent downward shift. This may reflect changes in reporting behavior, public interest, or data availability over time.
##Number of sightings depending on the day of the week
# Bar chart of sighting counts for each day of the week.
ggplot(
  data = count(ufo_model_data_mutated, weekday),
  mapping = aes(x = weekday, y = n)
) +
  geom_col(fill = "orange") +
  labs(
    x = "Day of the week",
    y = "Number of sightings",
    title = "Sightings depending on the day of the week"
  )
Interpretation:
The number of UFO sightings varies by day of the week. The highest counts occur on Saturdays and Sundays, while Tuesdays have the fewest reports. This suggests people are more likely to notice and report sightings during weekends, possibly due to having more free time or being outdoors more often.
# Bar chart of sighting counts for each hour of the day (0-23).
ufo_model_data_mutated %>%
  count(hour = hour(reported_date_time)) %>%
  ggplot(mapping = aes(x = hour, y = n)) +
  geom_col(fill = "purple") +
  labs(
    x = "Hour of the day",
    y = "Number of sightings",
    title = "Hourly distribution of sightings"
  )
Interpretation:
UFO sightings are most frequently reported between 8 PM and 3 AM, peaking around 2 AM. Sightings are least common during midday hours. This pattern suggests that sightings are more likely to occur—or at least be noticed and reported—at night, when the sky is dark and unusual lights are more visible.
# Heatmap of sighting counts by hour of the day and day of the week.
ufo_model_data_mutated %>%
  mutate(
    hour = hour(reported_date_time),
    # `weekday` was built with abbr = FALSE, so its levels are full names.
    # The abbreviated names used before matched no level, so the reordering
    # had no effect and only produced "unknown levels" warnings.
    weekday = fct_relevel(
      weekday,
      c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    )
  ) %>%
  count(weekday, hour) %>%
  ggplot(aes(x = hour, y = weekday, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c() +
  labs(
    title = "Heatmap: day of the week vs hour of the day",
    x = "Hour of the day",
    y = "Day of the week",
    fill = "Number of sightings"
  )
Interpretation:
The heatmap shows the distribution of UFO sightings by hour of the day and day of the week. Most sightings occur after midnight on Sunday, peaking between 1–3 AM. Other late-night hours, especially on weekends, also show elevated counts. This pattern reinforces that sightings are more frequent during late-night weekend hours, when people are likely to be awake and outdoors in dark conditions.
# Pie chart of the share of sightings that have an attached image.
ufo_model_data_mutated %>%
  mutate(image_status = ifelse(has_images, "Has an image", "Has no image")) %>%
  count(image_status) %>%
  ggplot(aes(x = "", y = n, fill = image_status)) +
  geom_col(width = 1) +
  # Polar coordinates turn the single stacked bar into a pie.
  coord_polar(theta = "y") +
  # Title typo fixed ("Sightins" -> "Sightings").
  labs(title = "Sightings with images vs no image", fill = "Image existence") +
  theme_void() +
  scale_fill_manual(
    values = c("Has an image" = "#66BB6A", "Has no image" = "#EF5350")
  )
Interpretation:
The chart shows that almost all UFO sightings lack images. Sightings with images are extremely rare, suggesting that reports are usually text-based or anecdotal. This indicates a strong reliance on witness testimony rather than visual evidence in the dataset.
Verifying that the pie chart above is correct
# Number of sightings flagged as having images (missing flags are ignored).
sum(ufo_model_data_mutated$has_images, na.rm = TRUE)
# Horizontal bar chart of sighting counts per shape, ordered by count.
ufo_model_data_mutated %>%
  count(shape) %>%
  mutate(shape = reorder(shape, n)) %>%
  ggplot(mapping = aes(x = shape, y = n)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(
    x = "Shape",
    y = "Number of sightings",
    title = "Number of sightings per shape"
  )
Interpretation:
The most commonly reported UFO shapes are light, circle, and triangle. Unusual shapes like cube, star, and cross are very rare. This suggests that most sightings describe simple or glowing forms, possibly influenced by visibility, perception, or common cultural imagery.
# Bar chart of sighting counts for countries with at least 100 sightings,
# ordered by count.
ufo_model_data_mutated %>%
  count(country_code) %>%
  filter(n >= 100) %>%
  mutate(country_code = reorder(country_code, n)) %>%
  ggplot(mapping = aes(x = country_code, y = n)) +
  geom_col(fill = "skyblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Country code",
    y = "Number of sightings",
    title = "Number of sightings per country"
  )
Interpretation:
The vast majority of UFO sightings come from the United States, with over 80,000 reports. Other countries like Canada and Great Britain have significantly fewer sightings. This suggests that the dataset is strongly US-centric, possibly due to better reporting infrastructure, public interest, or data source bias.
# World map of places with sightings; point size follows population and
# colour follows the population category.
world <- ne_countries(scale = "medium", returnclass = "sf")
places_clean %>%
  filter(!is.na(latitude), !is.na(longitude)) %>%
  mutate(
    pop_category = case_when(
      population < 10000 ~ "small",
      population < 100000 ~ "medium",
      TRUE ~ "large"
    )
  ) %>%
  ggplot() +
  # Draw the country outlines first so the points sit on top.
  geom_sf(data = world, fill = "lightgray", color = "black") +
  geom_point(aes(
    x = longitude, y = latitude,
    size = population, color = pop_category
  ), alpha = 0.6) +
  scale_size(range = c(1, 6), guide = "none") +
  labs(
    title = "Cities with UFO sightings",
    subtitle = "Point size ~ population, color ~ population category",
    # x carries longitude and y carries latitude; the previous labels
    # ("Latitude" / "Altitude") were wrong.
    x = "Longitude",
    y = "Latitude",
    color = "Population category"
  ) +
  theme_minimal()
Interpretation:
Sightings are most densely clustered in North America and Europe, especially in large urban areas. This suggests that population density and infrastructure may influence reporting frequency. Other regions show fewer reports, which could reflect lower reporting access or less data availability.
# Sighting counts by the population category of the sighting's city.
# Joining on `city_state` alone is ambiguous and produced a many-to-many
# join that duplicated sightings and inflated the counts. The lookup is
# reduced to one row per (city, state, country_code) and the join is
# declared many-to-one, so any remaining fan-out fails loudly.
place_categories <- places_model_data_mutated %>%
  distinct(city, state, country_code, .keep_all = TRUE) %>%
  select(city, state, country_code, pop_category)

ufo_model_data_mutated %>%
  left_join(
    place_categories,
    by = c("city", "state", "country_code"),
    relationship = "many-to-one"
  ) %>%
  count(pop_category) %>%
  # Order the bars by size class instead of alphabetically.
  mutate(
    pop_category = factor(pop_category, levels = c("small", "medium", "large"))
  ) %>%
  ggplot(aes(x = pop_category, y = n, fill = pop_category)) +
  geom_col(show.legend = FALSE) +
  labs(
    title = "UFO Sightings by City Population Category",
    x = "Population Category",
    y = "Number of Sightings"
  ) +
  scale_fill_manual(
    values = c("small" = "#91bfdb", "medium" = "#fdae61", "large" = "#d73027")
  ) +
  theme_minimal()
Ostrzeżenie: Detected an unexpected many-to-many relationship between `x` and `y`.
Interpretation:
The chart shows that most UFO sightings come from medium-sized cities, followed by large cities, with small towns reporting the fewest. This suggests that mid-sized urban areas may offer a balance of visibility, outdoor activity, and public engagement conducive to sightings. It also reflects where people live and are most likely to report unusual events.